Data Cleaning

Author

Shonushka Sawant

mmr_cov <- read.csv("USA/data/all/measles-USA-by-mmr-coverage.csv")
head(mmr_cov)
   geography school_year estimate_pct population_size percent_surveyed
1    Alabama     2023-24        93.8%           54565            100.0
2     Alaska     2023-24        84.3%            8644             88.9
3    Arizona     2023-24        89.3%           74834             99.6
4   Arkansas     2023-24        92.5%           37535             95.4
5 California     2023-24        96.2%          569680            100.0
6   Colorado     2023-24        88.3%           61662            100.0
                          survey_type    categories
1                              Census      90-94.9%
2 Census (pub.), Not Conducted (prv.) Less than 90%
3                              Census Less than 90%
4 Census (pub.), Vol. Response (prv.)      90-94.9%
5                              Census          95%+
6                              Census Less than 90%
drop_na.tbl_ts <- function(ts)  tsibble::as_tsibble(tidyr:::drop_na.data.frame(ts))
#drop rows with NA for vaccine percentage
mmr_cov <- mmr_cov %>% drop_na(estimate_pct)

#create a column with percent cast to numeric
mmr_cov$num_pct <- mmr_cov$estimate_pct

mmr_cov$num_pct <- as.numeric(sub("%","",mmr_cov$num_pct))/100
#summary statistics for vaccine coverage, per state
group_by(mmr_cov, geography) %>% 
  get_summary_stats(num_pct)
# A tibble: 53 × 14
   geography     variable     n   min   max median    q1    q3   iqr   mad  mean
   <chr>         <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 Alabama       num_pct     14 0.866 0.949  0.935 0.927 0.939 0.011 0.009 0.929
 2 Alaska        num_pct      8 0.78  0.944  0.903 0.841 0.929 0.088 0.054 0.884
 3 Arizona       num_pct     14 0.893 0.95   0.937 0.921 0.942 0.021 0.013 0.929
 4 Arkansas      num_pct     14 0.859 0.983  0.922 0.911 0.94  0.029 0.025 0.919
 5 California    num_pct     14 0.923 0.973  0.957 0.933 0.965 0.032 0.018 0.95 
 6 Colorado      num_pct     13 0.817 0.911  0.873 0.869 0.884 0.015 0.015 0.875
 7 Connecticut   num_pct     14 0.953 0.985  0.97  0.963 0.973 0.01  0.007 0.968
 8 Delaware      num_pct     13 0.897 0.985  0.964 0.957 0.976 0.019 0.018 0.96 
 9 District of … num_pct      6 0.789 0.92   0.88  0.834 0.895 0.062 0.044 0.865
10 Florida       num_pct     14 0.881 0.941  0.932 0.918 0.936 0.018 0.009 0.925
# ℹ 43 more rows
# ℹ 3 more variables: sd <dbl>, se <dbl>, ci <dbl>
head(mmr_cov)
   geography school_year estimate_pct population_size percent_surveyed
1    Alabama     2023-24        93.8%           54565            100.0
2     Alaska     2023-24        84.3%            8644             88.9
3    Arizona     2023-24        89.3%           74834             99.6
4   Arkansas     2023-24        92.5%           37535             95.4
5 California     2023-24        96.2%          569680            100.0
6   Colorado     2023-24        88.3%           61662            100.0
                          survey_type    categories num_pct
1                              Census      90-94.9%   0.938
2 Census (pub.), Not Conducted (prv.) Less than 90%   0.843
3                              Census Less than 90%   0.893
4 Census (pub.), Vol. Response (prv.)      90-94.9%   0.925
5                              Census          95%+   0.962
6                              Census Less than 90%   0.883
#summary statistics for vaccine coverage, per year
summ_year <- group_by(mmr_cov, school_year) %>% 
  get_summary_stats(num_pct)

summ_year
# A tibble: 14 × 14
   school_year variable     n   min   max median    q1    q3   iqr   mad  mean
   <chr>       <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 2009-10     num_pct     43 0.845 0.997  0.945 0.919 0.974 0.054 0.042 0.941
 2 2011-12     num_pct     44 0.868 0.993  0.948 0.928 0.969 0.041 0.031 0.944
 3 2012-13     num_pct     46 0.857 0.999  0.945 0.922 0.963 0.041 0.029 0.938
 4 2013-14     num_pct     47 0.817 0.997  0.947 0.924 0.965 0.041 0.031 0.938
 5 2014-15     num_pct     50 0.869 0.992  0.94  0.919 0.964 0.045 0.033 0.938
 6 2015-16     num_pct     51 0.871 0.994  0.946 0.925 0.963 0.037 0.027 0.942
 7 2016-17     num_pct     48 0.873 0.994  0.94  0.927 0.963 0.035 0.03  0.942
 8 2017-18     num_pct     49 0.887 0.994  0.943 0.926 0.964 0.038 0.028 0.943
 9 2018-19     num_pct     49 0.874 0.992  0.942 0.928 0.962 0.034 0.025 0.942
10 2019-20     num_pct     48 0.866 0.991  0.946 0.931 0.963 0.033 0.025 0.945
11 2020-21     num_pct     50 0.789 0.989  0.937 0.907 0.954 0.047 0.03  0.929
12 2021-22     num_pct     52 0.78  0.986  0.929 0.906 0.956 0.05  0.036 0.922
13 2022-23     num_pct     52 0.813 0.984  0.921 0.9   0.951 0.051 0.035 0.922
14 2023-24     num_pct     52 0.796 0.983  0.921 0.897 0.942 0.046 0.033 0.921
# ℹ 3 more variables: sd <dbl>, se <dbl>, ci <dbl>
#plot median vaccination rate by year

summ_year %>%
  tail(10) %>%
  ggplot( aes(x=school_year, y=median)) +
    geom_line(color="black") +
    geom_point()
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?

Median vaccination rates among states in the USA appear to be showing a downward trend starting from the 2019-20200 school year. We will now look at the trajectories for the states with the lowest vaccination rates in 2023-2024.

mmr_cov2023 <- mmr_cov %>% filter(school_year == "2023-24")
head(mmr_cov2023 %>% arrange(num_pct))
  geography school_year estimate_pct population_size percent_surveyed
1     Idaho     2023-24        79.6%           22376            100.0
2    Alaska     2023-24        84.3%            8644             88.9
3 Wisconsin     2023-24        84.8%           62028             98.2
4 Minnesota     2023-24        87.0%           66032             99.1
5   Florida     2023-24        88.1%          228213            100.0
6  Colorado     2023-24        88.3%           61662            100.0
                          survey_type    categories num_pct
1                              Census Less than 90%   0.796
2 Census (pub.), Not Conducted (prv.) Less than 90%   0.843
3                              Census Less than 90%   0.848
4                              Census Less than 90%   0.870
5                              Census Less than 90%   0.881
6                              Census Less than 90%   0.883

The states with the five lowest vaccination rates are Idaho (79.6%), Alaska (84.3%), Wisconsin (84.8%), Minnesota (87.0%), and Florida (88.1%).

Compare these rates to the 2009-10 school year:

low_2023 <- c("Idaho", "Alaska", "Wisconsin", "Minnesota", "Florida")
mmr_cov2009 <- mmr_cov %>% filter(geography %in% low_2023, school_year == "2009-10")
mmr_cov2009 %>% arrange(num_pct)
  geography school_year estimate_pct population_size percent_surveyed
1     Idaho     2009-10        87.0%           22624            100.0
2   Florida     2009-10        91.3%          218630            100.0
3 Wisconsin     2009-10        94.2%           61095              2.3
4 Minnesota     2009-10        95.1%           70653            100.0
    survey_type    categories num_pct
1        Census Less than 90%   0.870
2        Census      90-94.9%   0.913
3 Random Sample      90-94.9%   0.942
4        Census          95%+   0.951

Vaccination rates for these states were notably higher in 2009-10 (excepting Alaska, which did not report its vaccination rate that year.)

mmr_cov_low <- mmr_cov %>% filter(geography %in% low_2023)
ggplot(mmr_cov_low, aes(geography, num_pct, fill = geography)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  guides(fill = "none") +
  labs(
    x = "", 
    y = "Vaccination Rate",
    title = "Vaccination Variation by State (2023-2024)",
    subtitle = "States with Lowest Vaccination Rates"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, size = 12))

Wisconsin and Alaska show the greatest variation in vaccination rates, of the five states with the lowest rates in 2023-24.

#at this point we will add a numeric column for the dates.
mmr_cov$num_years <- substr(mmr_cov$school_year, 1, 4)
mmr_cov$num_years <- as.numeric(mmr_cov$num_years)

head(mmr_cov)
   geography school_year estimate_pct population_size percent_surveyed
1    Alabama     2023-24        93.8%           54565            100.0
2     Alaska     2023-24        84.3%            8644             88.9
3    Arizona     2023-24        89.3%           74834             99.6
4   Arkansas     2023-24        92.5%           37535             95.4
5 California     2023-24        96.2%          569680            100.0
6   Colorado     2023-24        88.3%           61662            100.0
                          survey_type    categories num_pct num_years
1                              Census      90-94.9%   0.938      2023
2 Census (pub.), Not Conducted (prv.) Less than 90%   0.843      2023
3                              Census Less than 90%   0.893      2023
4 Census (pub.), Vol. Response (prv.)      90-94.9%   0.925      2023
5                              Census          95%+   0.962      2023
6                              Census Less than 90%   0.883      2023
#fitting linear mixed effects model.
#this is inaccurate, just for practice with lmer for now.
library(lme4)
Warning: package 'lme4' was built under R version 4.4.2
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Attaching package: 'lme4'
The following object is masked from 'package:fabletools':

    refit
lin_0 <- lmer(num_years ~ 1 + (1 | geography), data = mmr_cov)
boundary (singular) fit: see help('isSingular')
summary(lin_0)
Linear mixed model fit by REML ['lmerMod']
Formula: num_years ~ 1 + (1 | geography)
   Data: mmr_cov

REML criterion at convergence: 3861.8

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.85230 -0.88142  0.08946  0.81762  1.54578 

Random effects:
 Groups    Name        Variance Std.Dev.
 geography (Intercept)  0.00    0.00    
 Residual              16.97    4.12    
Number of obs: 681, groups:  geography, 53

Fixed effects:
             Estimate Std. Error t value
(Intercept) 2016.6314     0.1579   12773
optimizer (nloptwrap) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')
library(Epi)

Attaching package: 'Epi'
The following object is masked from 'package:lme4':

    factorize
ci.lin(lin_0)
            Estimate   StdErr        z P     2.5%    97.5%
(Intercept) 2016.631 0.157878 12773.35 0 2016.322 2016.941
mmr_cov %>%
  filter(geography %in% low_2023) %>%
  ggplot(aes(school_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2015-2024)",
       y = "Vaccination rate", x = "Year")

state_vax <- mmr_cov %>%
  filter(school_year == "2023-24") %>%
  group_by(state = geography) %>%
  summarize(mean_vax = mean(num_pct))

plot_usmap(data = state_vax, values = "mean_vax", color = "white") +
  scale_fill_continuous(low = "red", high = "green", name = "Vaccination %") +
  labs(title = "Measles Vaccination Rates by State, 2023") +
  theme(legend.postion = "right")
Warning in plot_theme(plot): The `legend.postion` theme element is not defined
in the element hierarchy.

state_lookup <- tibble(
  state = state.name,
  abbr  = state.abb
)

vax_yearly <- mmr_cov %>%
  mutate(year = as.integer(substr(school_year, 1, 4))) %>%
  group_by(state = geography, year) %>%
  summarize(mean_vax = mean(num_pct, na.rm = TRUE), .groups = "drop") %>%
  left_join(state_lookup, by = "state") %>%
  filter(!is.na(abbr))  


fig_map <- plot_geo(
    vax_yearly, 
    locationmode = 'USA-states'
  ) %>%
  add_trace(
    z         = ~mean_vax,
    locations = ~abbr,
    frame     = ~year,
    colorscale= 'RdYlGn',
    reversescale = TRUE,
    marker    = list(line = list(color = 'white', width = 0.5)),
    hovertemplate = paste0(
      "%{location}<br>",
      "Year: %{frame}<br>",
      "Vaccination: %{z:.1%}<extra></extra>"
    )
  ) %>%
  layout(
    title = list(text = "Measles Vaccination Rates by State (2009–2023)", x = 0.5),
    geo   = list(scope = 'usa'),
    updatemenus = list(
      list(
        type    = "buttons",
        x       = 1.1,  y = 0,
        showactive = FALSE,
        buttons = list(
          list(method = "animate",
               args   = list(NULL, 
                             list(frame = list(duration = 500, redraw = FALSE),
                                  transition = list(duration = 0),
                                  fromcurrent = TRUE,
                                  mode = "immediate")),
               label  = "▶ Play")
        )
      )
    )
  )

fig_map  # this will render inline in RMarkdown
cum_df <- vax_yearly %>%
  arrange(state, year) %>%
  group_by(state) %>%
  mutate(cum_avg = cummean(mean_vax))

fig_cum <- plot_ly(
    cum_df,
    x = ~year,
    y = ~cum_avg,
    color = ~state,
    type  = 'scatter',
    mode  = 'lines',
    hoverinfo = 'text',
    text = ~paste0(state, ": ", scales::percent(cum_avg, accuracy = 0.1))
  ) %>%
  layout(
    title = "Cumulative Average Vaccination Rate by State",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Cumulative Avg (%)")
  )

fig_cum
print(fig_map)
print(fig_cum)
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
ts_vax <- mmr_cov %>%
  filter(geography %in% low_2023) %>%
  mutate(
    year = as.integer(substr(school_year, 1, 4))
  ) %>%
  group_by(year) %>%
  summarize(mean_vax = mean(num_pct, na.rm = TRUE)) %>%
  ungroup() %>%
  # explicitly fill any missing year between min and max
  complete(year = seq(min(year), max(year), by = 1)) %>%
  as_tsibble(index = year)

# now every year in the span is present (missing mean_vax will be NA)
fit <- ts_vax %>% model(ETS(mean_vax))
Warning: 1 error encountered for ETS(mean_vax)
[1] ETS does not support missing values.
fc  <- forecast(fit, h = 2)

autoplot(ts_vax, mean_vax) +
  autolayer(fc, .mean) +
  labs(
    title = "Forecasted Vaccination Rates (2025–2026)",
    x = "Year",
    y = "Mean Vaccination Rate"
  )
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_line()`).

# build a plain ts-series with frequency = 1 (annual)
vax_ts <- ts(
  data      = ts_vax$mean_vax,
  start     = min(ts_vax$year),
  frequency = 1
)

fc <- forecast(ets(vax_ts), h = 2)
Warning in ets(vax_ts): Missing values encountered. Using longest contiguous
portion of time series
autoplot(fc) +
  labs(
    title = "Forecasted Vaccination Rates (2025–2026)",
    x     = "Year",
    y     = "Mean Vaccination Rate"
  )

library(tsibble)
library(fable)
library(dplyr)
library(tidyr)

ts_vax_fable <- mmr_cov %>%
  filter(geography %in% low_2023) %>%
  mutate(year = as.integer(substr(school_year,1,4))) %>%
  group_by(year) %>%
  summarize(mean_vax = mean(num_pct, na.rm = TRUE)) %>%
  ungroup() %>%
  complete(year = seq(min(year), max(year), by = 1)) %>%
  as_tsibble(index = year) %>%
  # simple linear interpolation for the missing years
  mutate(mean_vax = approx(year[!is.na(mean_vax)],
                           mean_vax[!is.na(mean_vax)],
                           xout = year)$y)

fit_fbl <- ts_vax_fable %>% model(ETS(mean_vax))
fc_fbl  <- forecast(fit_fbl, h = 2)

autoplot(fc_fbl) +
  labs(title = "Forecasted Vaccination Rates (2025–2026)")

library(dplyr)
library(tsibble)
library(tidyr)
library(fable)
library(ggplot2)

# 1) Build one annual series per state, and fill implicit gaps
ts_states <- mmr_cov %>%
  mutate(year = as.integer(substr(school_year, 1, 4))) %>%
  group_by(geography, year) %>%
  summarize(mean_vax = mean(num_pct, na.rm = TRUE), .groups="drop") %>%
  as_tsibble(key = geography, index = year) %>%
  # make every year explicit (2009:2023), with NA where missing
  fill_gaps() %>%
  # simple linear interpolation of those NAs (you could also choose carry-forward, etc.)
  group_by_key() %>%
  mutate(
    mean_vax = approx(
      x    = year[!is.na(mean_vax)],
      y    = mean_vax[!is.na(mean_vax)],
      xout = year,
      rule = 2
    )$y
  ) %>%
  ungroup()

# 2) Fit ETS to each state
models <- ts_states %>%
  model(ETS = ETS(mean_vax))

# 3) Forecast two years ahead
fc_states <- models %>%
  forecast(h = 2)

autoplot(fc_states) +
  labs(
    title = "Vaccination Trends (2009–2023) and Forecast (2025–26)",
    x     = "Year",
    y     = "Mean Vaccination Rate"
  ) +
  facet_wrap(~ geography) +
  theme_minimal()

fc_states %>%
  as_tibble() %>%
  filter(year >= 2025) %>%
  select(
    geography,
    year,
    point = .mean
  )
# A tibble: 52 × 3
   geography             year point
   <chr>                <dbl> <dbl>
 1 Alabama               2025 0.928
 2 Alaska                2025 0.843
 3 Arizona               2025 0.879
 4 Arkansas              2025 0.925
 5 California            2025 0.962
 6 Colorado              2025 0.882
 7 Connecticut           2025 0.977
 8 Delaware              2025 0.938
 9 District of Columbia  2025 0.854
10 Florida               2025 0.841
# ℹ 42 more rows

Adding graphs for how each region has changed from 2009 to 2023.

#at this point we will add a numeric column for the dates.
mmr_cov$first_year <- substr(mmr_cov$school_year, 1, 4)
mmr_cov$num_years <- as.numeric(mmr_cov$first_year)

head(mmr_cov)
   geography school_year estimate_pct population_size percent_surveyed
1    Alabama     2023-24        93.8%           54565            100.0
2     Alaska     2023-24        84.3%            8644             88.9
3    Arizona     2023-24        89.3%           74834             99.6
4   Arkansas     2023-24        92.5%           37535             95.4
5 California     2023-24        96.2%          569680            100.0
6   Colorado     2023-24        88.3%           61662            100.0
                          survey_type    categories num_pct num_years
1                              Census      90-94.9%   0.938      2023
2 Census (pub.), Not Conducted (prv.) Less than 90%   0.843      2023
3                              Census Less than 90%   0.893      2023
4 Census (pub.), Vol. Response (prv.)      90-94.9%   0.925      2023
5                              Census          95%+   0.962      2023
6                              Census Less than 90%   0.883      2023
  first_year
1       2023
2       2023
3       2023
4       2023
5       2023
6       2023
new_england <- c("Connecticut", "Maine", "Massachusetts", " New Hampshire",
                 " Rhode Island", "Vermont")
middle_atlantic <- c("New Jersey", "New York", "Pennsylvania")

east_north_central <- c("Indiana", "Illinois", "Michigan", "Ohio", "Wisconsin")
west_north_central <- c("Iowa", "Kansas", "Missouri", "Minnesota", "Nebraska",
                        "North Dakota", "South Dakota")

south_atlantic <- c("Delaware", " District of Columbia", "Florida", "Georgia",
                    "Maryland", " North Carolina",  "South Carolina", "Virginia",
                    "West Virginia")
east_south_central <- c("Alabama", "Kentucky", "Mississippi", "Tennessee")
west_south_central <- c("Arkansas", "Louisiana", "Oklahoma", "Texas")

mountain <- c("Arizona", "Colorado", "Idaho", "New Mexico", "Montana",
              "Utah", "Nevada", "Wyoming")
pacific <- c("Alaska", "California", "Hawaii", "Oregon", "Washington")
#rates for new england
mmr_cov %>%
  filter(geography %in% new_england) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): New England",
       y = "Vaccination rate", x = "Year")

#rates for middle atlantic
mmr_cov %>%
  filter(geography %in% middle_atlantic) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): Middle Atlantic",
       y = "Vaccination rate", x = "Year")

#rates for east north central
mmr_cov %>%
  filter(geography %in% east_north_central) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024: East North Central)",
       y = "Vaccination rate", x = "Year")

mmr_cov %>%
  filter(geography %in% west_north_central) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): West North Central",
       y = "Vaccination rate", x = "Year")

mmr_cov %>%
  filter(geography %in% south_atlantic) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): South Atlantic",
       y = "Vaccination rate", x = "Year")

mmr_cov %>%
  filter(geography %in% east_south_central) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): East South Central",
       y = "Vaccination rate", x = "Year")

mmr_cov %>%
  filter(geography %in% west_south_central) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): West South Central",
       y = "Vaccination rate", x = "Year")

mmr_cov %>%
  filter(geography %in% mountain) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): Mountain",
       y = "Vaccination rate", x = "Year")

mmr_cov %>%
  filter(geography %in% pacific) %>%
  ggplot(aes(first_year, num_pct, color = geography, group = geography)) +
  geom_line() + geom_point() +
  labs(title = "Trends in Vaccination Rates (2009-2024): Pacific",
       y = "Vaccination rate", x = "Year")